import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import json
import plotly.express as px
from itertools import combinations
from collections import defaultdict
from folium.plugins import HeatMap
from matplotlib import font_manager
from folium.plugins import MarkerCluster
from folium import GeoJson
from folium import LinearColormap
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.preprocessing import StandardScaler
from matplotlib import gridspec
# Configure Matplotlib to render Korean labels with Malgun Gothic.
# NOTE(review): Windows-only font path — adjust when running elsewhere.
font_path = "C:/Windows/Fonts/malgun.ttf"
font_family = font_manager.FontProperties(fname=font_path).get_name()
plt.rcParams["font.family"] = font_family
# Keep minus signs renderable under the non-Latin font.
plt.rcParams["axes.unicode_minus"] = False
# plt.rc("font", family="Malgun Gothic")
# Show up to 1000 rows when displaying DataFrames in the notebook.
pd.options.display.max_rows = 1000
# Jupyter magics: inline Matplotlib output; require.js enables Plotly rendering.
%matplotlib inline
%%HTML
<script src="require.js"></script>
# Maps each game-server code to the continental routing region it belongs to.
REGIONS_INFO = dict(
    BR1="AMERICAS",
    EUN1="EUROPE",
    EUW1="EUROPE",
    JP1="ASIA",
    KR="ASIA",
    LA1="AMERICAS",
    LA2="AMERICAS",
    NA1="AMERICAS",
    OC1="SEA",
    PH2="SEA",
    RU="EUROPE",
    SG2="SEA",
    TH2="SEA",
    TR1="EUROPE",
    TW2="SEA",
    VN2="SEA",
)
# Load the raw match-data extracts (snapshot dated 08-19).
# NOTE(review): paths are relative to the notebook directory — confirm layout.
player = pd.read_csv("../data/0819/player.csv")
player_stat = pd.read_csv("../data/0819/top10_player.csv")
match = pd.read_csv("../data/0819/match.csv")
match_player = pd.read_csv("../data/0819/match_player.csv")
match_trait = pd.read_csv("../data/0819/match_trait.csv")
match_unit = pd.read_csv("../data/0819/match_unit.csv")
# Count matches per game version to locate the newest patch in the data.
match.groupby(['version_major', 'version_minor', 'version_patch']).count()
| match_id | match_date | match_length | version_date | tft_set_number | |||
|---|---|---|---|---|---|---|---|
| version_major | version_minor | version_patch | |||||
| 13 | 14 | 522 | 39 | 39 | 39 | 39 | 39 |
| 15 | 523 | 45 | 45 | 45 | 45 | 45 | |
| 524 | 1332 | 1332 | 1332 | 1332 | 1332 | ||
| 16 | 525 | 1456 | 1456 | 1456 | 1456 | 1456 |
# Latest observed game-version components across all matches.
VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH = (
    match[column].max()
    for column in ("version_major", "version_minor", "version_patch")
)
print(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
13 16 525
# Load the Korean (ko_kr) Data Dragon localization file.
# NOTE(review): no explicit encoding= — relies on the platform default;
# passing encoding="utf-8" would be safer for this JSON.
file_path = "../json/ko_kr.json"
with open(file_path, "r") as json_file:
    data_dragon = json.load(json_file)
# DataFrames for item, unit, and trait metadata (TFT Set 9).
items = pd.DataFrame(data_dragon["items"])
set9 = pd.DataFrame(data_dragon["sets"])["9"]
set9_units = pd.DataFrame(set9["champions"])
set9_traits = pd.DataFrame(set9["traits"])
# Build lowercased apiName -> localized-name lookup dictionaries.
item_names = dict(zip(items["apiName"].str.lower(), items["name"].str.lower()))
unit_names = dict(
    zip(set9_units["apiName"].str.lower(), set9_units["name"].str.lower())
)
trait_names = dict(
    zip(set9_traits["apiName"].str.lower(), set9_traits["name"].str.lower())
)
# Replace trait and unit API names with their localized display names.
match_trait["name"] = match_trait["name"].apply(lambda x: trait_names[x.lower()])
match_unit["name"] = match_unit["name"].apply(lambda x: unit_names[x.lower()])
# Item slots may be NaN (no item equipped); map those to the empty string.
# One loop instead of three copy-pasted apply() calls per item column.
for _item_col in ("item1", "item2", "item3"):
    match_unit[_item_col] = match_unit[_item_col].apply(
        lambda x: item_names[x.lower()] if isinstance(x, str) else ""
    )
def calculate_fence(data, coef=1.5):
    """Return the (lower, upper) Tukey fences for *data*.

    Values outside [Q1 - coef*IQR, Q3 + coef*IQR] are treated as outliers.

    Parameters
    ----------
    data : array-like of numbers
        Sample to compute the fences over.
    coef : float, optional
        IQR multiplier; 1.5 is the standard Tukey fence.

    Returns
    -------
    tuple of (float, float)
        (lower_fence, upper_fence).
    """
    # One percentile call computes both quartiles in a single pass.
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    return q1 - coef * iqr, q3 + coef * iqr
# Inspect matches that were not played properly (very short / abandoned games).
fig = plt.figure(figsize=(10, 5))
fig.suptitle("게임 길이 기준 이상치 검출")
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
sns.boxplot(match_player, y="last_round", color="skyblue", ax=ax1)
sns.boxplot(match_player, y="time_eliminated", color="skyblue", ax=ax2)
fig.tight_layout()
plt.show()
# Tukey fences for both game-length proxies.
last_round_lfence, last_round_ufence = calculate_fence(match_player["last_round"])
time_eliminated_lfence, time_eliminated_ufence = calculate_fence(
    match_player["time_eliminated"]
)
# Flag players whose round count or elimination time falls outside the
# Tukey fences computed above.
outlier_condition = (
    (match_player["last_round"] > last_round_ufence)
    | (match_player["last_round"] < last_round_lfence)
    | (match_player["time_eliminated"] > time_eliminated_ufence)
    | (match_player["time_eliminated"] < time_eliminated_lfence)
)
outlier_match_player = match_player[outlier_condition]
# Left join because some players never fielded a single unit.
outlier_match_player = outlier_match_player.merge(
    match_unit, how="left", left_on="match_player_id", right_on="match_player_id"
)
outlier_match_player = outlier_match_player.loc[
    :,
    [
        "match_player_id",
        "last_round",
        "level",
        "placement",
        "time_eliminated",
        "name",
        "tier",
    ],
]
# Collapse to one row per player: keep scalar stats, collect unit names.
outlier_match_player = outlier_match_player.groupby(by="match_player_id").agg(
    {
        "last_round": "first",
        "level": "first",
        "placement": "first",
        "time_eliminated": "first",
        "name": list,
        "tier": "mean",
    }
)
# Exclude players who drafted T-Hex ("t-헥스") from the outlier set.
outlier_player = outlier_match_player[
    outlier_match_player["name"].apply(
        lambda champ_list: True if "t-헥스" not in champ_list else False
    )
]
# Preview of the result:
# outlier_player
# Remove the outlier players' own rows; keep the data of the players
# who shared their matches.
preprocessed_match_player = match_player[
    ~match_player["match_player_id"].isin(outlier_player.index)
]
preprocessed_match_unit = match_unit[
    ~match_unit["match_player_id"].isin(outlier_player.index)
]
preprocessed_match_trait = match_trait[
    ~match_trait["match_player_id"].isin(outlier_player.index)
]
# Re-check the game-length distributions after outlier removal.
fig = plt.figure(figsize=(10, 5))
fig.suptitle("게임 길이 기준 이상치 검출")
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
sns.boxplot(preprocessed_match_player, y="last_round", color="skyblue", ax=ax1)
sns.boxplot(preprocessed_match_player, y="time_eliminated", color="skyblue", ax=ax2)
fig.tight_layout()
plt.show()
# preprocessed_match_unit = preprocessed_match_unit.loc[:,['match_player_id', 'name', 'rarity', 'tier']]
# Heimerdinger's turret ("최첨단 포탑") has no cost, so drop it.
preprocessed_match_unit = preprocessed_match_unit[
    preprocessed_match_unit["name"] != "최첨단 포탑"
]
# Drop special units (rarity codes above 6).
preprocessed_match_unit = preprocessed_match_unit[
    preprocessed_match_unit["rarity"] <= 6
]
# Map the API rarity code to the in-game shop cost.
# NOTE(review): codes 3 and 5 are unmapped — assumes they never occur after
# the filters above; a KeyError in the apply below would mean they do.
rarity_to_cost = {0: 1, 1: 2, 2: 3, 4: 4, 6: 5}
# Clamp tier values of 4 or more down to 3.
preprocessed_match_unit["tier"] = preprocessed_match_unit["tier"].apply(
    lambda x: 3 if x >= 4 else x
)
preprocessed_match_unit["single_cost"] = preprocessed_match_unit.loc[:, "rarity"].apply(
    lambda x: rarity_to_cost[x]
)
# Total unit cost: base cost times 3^(tier - 1) copies.
preprocessed_match_unit["cost"] = preprocessed_match_unit.loc[:, "single_cost"] * (
    3 ** (preprocessed_match_unit.loc[:, "tier"] - 1)
)
# Aggregate per player: unit list, total board cost, average star tier.
revised_match_unit = preprocessed_match_unit.groupby(by="match_player_id").agg(
    {"name": list, "cost": "sum", "tier": "mean"}
)
revised_match_unit["unit_count"] = revised_match_unit["name"].apply(lambda x: len(x))
# Attach each player's match results for the downstream analysis.
revised_match_unit = revised_match_unit.merge(
    preprocessed_match_player.loc[
        :, ["match_player_id", "placement", "last_round", "level", "time_eliminated"]
    ],
    how="inner",
    left_on="match_player_id",
    right_on="match_player_id",
)
revised_match_unit.rename(
    columns={"cost": "cost_sum", "tier": "avg_tier"}, inplace=True
)
# revised_match_unit[['cost_sum', 'avg_tier', 'unit_count', 'placement']].head(24)
# Interactive box plot of total board cost to eyeball the outlier threshold.
fig = px.box(
    revised_match_unit,
    y="cost_sum",
    title="유닛 비용의 합 기준 이상치 검출",
    color_discrete_sequence=["#58F"],
    template="plotly",
)
fig
# Players whose total board cost is 18 or less are treated as outliers.
outlier_player = revised_match_unit[revised_match_unit["cost_sum"] <= 18][
    "match_player_id"
]
def _remove_outlier_players(frame, outlier_ids):
    """Return *frame* without rows whose match_player_id is in *outlier_ids*."""
    return frame[~frame["match_player_id"].isin(outlier_ids)]


# Remove the outlier players' rows while keeping the data of the other
# players from the same matches. The shapes before/after show how many
# rows were dropped.
print(
    "제거 전:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
preprocessed_match_player = _remove_outlier_players(
    preprocessed_match_player, outlier_player
)
preprocessed_match_unit = _remove_outlier_players(
    preprocessed_match_unit, outlier_player
)
preprocessed_match_trait = _remove_outlier_players(
    preprocessed_match_trait, outlier_player
)
revised_match_unit = _remove_outlier_players(revised_match_unit, outlier_player)
print(
    "제거 후:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
제거 전: (22876, 7) (184304, 10) (245399, 7) (22864, 9) 제거 후: (22840, 7) (184177, 10) (245202, 7) (22828, 9)
# Box plot after the cost-based removal to verify the new distribution.
fig = px.box(
    revised_match_unit,
    y="cost_sum",
    title="유닛 비용의 합 기준 이상치 검출",
    color_discrete_sequence=["#58F"],
    template="plotly",
)
fig
# Players fielding fewer than 5 units are treated as outliers.
outlier_player = revised_match_unit[(revised_match_unit["unit_count"] < 5)][
    "match_player_id"
]
# Remove the outlier players' rows while keeping the data of the other
# players from the same matches.
print(
    "제거 전:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
preprocessed_match_player = preprocessed_match_player[
    ~preprocessed_match_player["match_player_id"].isin(outlier_player)
]
preprocessed_match_unit = preprocessed_match_unit[
    ~preprocessed_match_unit["match_player_id"].isin(outlier_player)
]
preprocessed_match_trait = preprocessed_match_trait[
    ~preprocessed_match_trait["match_player_id"].isin(outlier_player)
]
revised_match_unit = revised_match_unit[
    ~revised_match_unit["match_player_id"].isin(outlier_player)
]
print(
    "제거 후:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
제거 전: (22840, 7) (184177, 10) (245202, 7) (22828, 9) 제거 후: (22799, 7) (184035, 10) (245046, 7) (22787, 9)
# Representative countries served by each regional game server.
SERVER_INFO = dict(
    BR1=("Brazil",),
    EUN1=("Sweden", "Norway", "Estonia", "Latvia"),
    EUW1=("Spain", "United Kingdom", "Belgium"),
    JP1=("Japan",),
    KR=("Korea",),
    LA1=("Mexico", "Columbia", "Peru"),
    LA2=("Bolivia", "Uruguay", "Chile"),
    NA1=("United States", "Canada"),
    OC1=("Australia", "New Zealand"),
    PH2=("Philippines",),
    RU=("Russia",),
    SG2=("Singapore", "Malaysia", "Indonesia"),
    TH2=("Thailand",),
    TR1=("Turkey",),
    TW2=("Taiwan", "Hong Kong", "Macao"),
    VN2=("Vietnam",),
)
# Derive region / continent / date fields from the match id and match date.
revised_match = match.copy()
# Match ids look like "<REGION>_<number>"; the prefix is the server code.
revised_match['region'] = revised_match['match_id'].apply(lambda x: x.split("_")[0])
revised_match['continent'] = revised_match['region'].apply(lambda x: REGIONS_INFO[x])
# Date only (drop the time-of-day part), then a numeric timestamp for plotting.
revised_match['date'] = pd.to_datetime(revised_match['match_date'].apply(lambda x: x.split(" ")[0]))
revised_match['timestamp'] = revised_match['date'].apply(lambda x: x.timestamp())
# Count distinct players (puuid) per (continent, region).
player_count = revised_match.merge(match_player, how='inner', left_on = 'match_id', right_on = 'match_id')
player_count = player_count.drop_duplicates('puuid')
player_count = player_count.loc[:, ['continent', 'region', 'puuid']]
player_count = player_count.groupby(['continent', 'region']).count()
player_count = player_count.reset_index().sort_values("puuid", ascending=True)
player_count.rename(columns={"puuid": "count"}, inplace=True)
# Expand each server row into one row per country it serves, so counts can
# be joined against a per-country world map. (Comprehension replaces the
# previous append loop; the count variable was misleadingly named "puuid".)
country_player_count = pd.DataFrame(
    [
        (region, count, country)
        for region, count in player_count.loc[:, ["region", "count"]].to_numpy()
        for country in SERVER_INFO.get(region, ())
    ],
    columns=["region", "count", "country"],
)
# Bar chart of user counts per server, colored by continent.
fig = px.bar(
    player_count,
    x="region",
    y="count",
    text_auto=".3s",
    color="continent",
    title="서버별 사용자 수",
    color_discrete_sequence=px.colors.sequential.Turbo[1::1],
)
# Put the count labels outside the bars and keep them from being clipped.
fig.update_traces(
    textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
)
fig.show()
# GeoJSON URL for world country boundaries (Natural Earth, 50m resolution).
url = "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"
# Read the GeoJSON into a GeoDataFrame.
countries_geo = gpd.read_file(url)
# Base map centered loosely on Europe/Africa.
# NOTE(review): the name `map` shadows the builtin — consider renaming.
map = folium.Map(location=(30, 10), zoom_start=2, tiles="cartodb positron")
# 16 evenly spaced legend bins spanning the observed player counts.
bins = np.linspace(
    country_player_count["count"].min(), country_player_count["count"].max(), 16
)
# Choropleth Layer keyed on the country name property of each feature.
choropleth_layer = folium.Choropleth(
    geo_data=countries_geo,
    data=country_player_count,
    columns=["country", "count"],
    key_on="feature.properties.name",
    fill_color="Blues",
    fill_opacity=0.7,
    line_opacity=0.3,
    nan_fill_color="white",
    legend_name="Player count",
    bins=bins,
)
choropleth_layer.add_to(map)
# Country-indexed frame for fast tooltip lookups.
player_data_indexed = country_player_count.set_index("country")
# Inject tooltip fields (count, server) into each GeoJSON feature;
# countries with no player data get empty strings.
for s in choropleth_layer.geojson.data["features"]:
    region_name = s["properties"]["name"]
    if region_name not in player_data_indexed.index:
        count_value = ""
        server_value = ""
    else:
        count_value = str(
            player_data_indexed.loc[region_name, "count"]
        )  # Convert to str
        server_value = str(
            player_data_indexed.loc[region_name, "region"]
        )  # Convert to str
    s["properties"]["count"] = count_value
    s["properties"]["server"] = server_value
tooltip = folium.GeoJsonTooltip(fields=["name", "server", "count"], labels=True)
choropleth_layer.geojson.add_child(tooltip)
# Alternate tile layer plus a control to switch between them.
folium.TileLayer("openstreetmap").add_to(map)
folium.LayerControl().add_to(map)
map
# Subplot grid: one box plot of match timestamps per region.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))
sns.set_theme(style='whitegrid', font=font_family)
# Draw one box plot per region.
for (region, ax) in zip(revised_match['region'].unique(), axes.flatten()):
    sns.boxplot(data=revised_match[revised_match['region'] == region], y='timestamp', color='skyblue', ax=ax)
    # Shared, slightly padded y-range so regions are visually comparable.
    ax.set_ylim(revised_match['timestamp'].min()*0.9999, revised_match['timestamp'].max()*1.0001)
    ax.set_title(f'Region {region}')
# Adjust the layout.
plt.tight_layout()
plt.show()
# Match history of the top-10 players, joined with player and match info.
top10_player = player_stat.loc[:, ['puuid']]
top10_player_match = top10_player.merge(match_player, how='inner', left_on='puuid', right_on = 'puuid')
# top10_player_match = top10_player_match.drop_duplicates('match_id')
top10_player_match = top10_player_match.merge(player, how='inner', left_on='puuid', right_on='puuid')
top10_player_match = top10_player_match.merge(revised_match, how='inner', left_on='match_id', right_on='match_id')
# NOTE(review): 'region_x' comes from merge suffixing — presumably the
# player-side region column; verify which table it originates from.
top10_player_match_reg = top10_player_match[top10_player_match['region_x']=='NA1']
# Subplot grid: one box plot of match timestamps per top-10 NA1 player.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
sns.set_theme(style='whitegrid', font=font_family)
for (name, ax) in zip(top10_player_match_reg['name'].unique(), axes.flatten()):
    sns.boxplot(data=top10_player_match_reg[top10_player_match_reg['name'] == name], y='timestamp', color='skyblue', ax=ax)
    # Shared y-range across all players for comparability.
    ax.set_ylim(top10_player_match['timestamp'].min()*0.9999, top10_player_match['timestamp'].max()*1.0001)
    ax.set_title(f'{name}', fontsize=9)
    ax.set_yticklabels("")
# Adjust the layout.
plt.tight_layout()
plt.show()
# Exclude traits whose total tier is 1 and traits whose current tier is 0.
revised_match_trait = preprocessed_match_trait[preprocessed_match_trait["tier_total"] > 1]
revised_match_trait = revised_match_trait[revised_match_trait["tier_current"] > 0]
# Collect each player's remaining trait names into a list (transaction form).
revised_match_trait = revised_match_trait.groupby("match_player_id")[["name"]].agg(list)
# Merge so each player's trait transaction carries the match version info.
merged_match_trait = match.merge(
    preprocessed_match_player, how="inner", left_on="match_id", right_on="match_id"
)
merged_match_trait = merged_match_trait.merge(
    revised_match_trait,
    how="inner",
    left_on="match_player_id",
    right_on="match_player_id",
)
# Keep only records from the most recent patch version.
latest_match_trait = merged_match_trait[
    (merged_match_trait["version_major"] == VERSION_MAJOR)
    & (merged_match_trait["version_minor"] == VERSION_MINOR)
    & (merged_match_trait["version_patch"] == VERSION_PATCH)
]
# BUG FIX: this previously re-selected from merged_match_trait, which
# silently discarded the version filter above; select from the filtered
# frame instead.
latest_match_trait = latest_match_trait.loc[:, ["match_player_id", "name"]]
# One-hot encode each player's trait list, then mine frequent itemsets
# with a 5% minimum support.
te = TransactionEncoder()
te_result = te.fit_transform(latest_match_trait["name"])
td_df = pd.DataFrame(te_result, columns=te.columns_)
freq_items = apriori(td_df, min_support=0.05, use_colnames=True)
# freq_items.sort_values('support', ascending=False).head(15)
# Association-rule mining: keep rules with confidence >= 0.8.
rules = association_rules(freq_items, metric="confidence", min_threshold=0.8)
# Readable comma-joined labels for the plot tooltips.
rules["antecedents_str"] = rules["antecedents"].apply(lambda x: ",".join(list(x)))
rules["consequents_str"] = rules["consequents"].apply(lambda x: ",".join(list(x)))
# rules.head()
# Scatter plot of rules: support vs confidence, sized and colored by lift.
fig = px.scatter(
    rules,
    x="support",
    y="confidence",
    size="lift",
    color="lift",
    custom_data=["antecedents_str", "consequents_str", "lift"],
    color_continuous_scale=px.colors.sequential.Jet,
    title="특성 조합 연관규칙",
    labels={"support": "지지도", "confidence": "신뢰도", "lift": "향상도"},
)
# Custom tooltip: antecedent, consequent, and the three rule metrics.
fig.update_traces(
    hovertemplate="<br>".join(
        [
            "<b>조건</b>: %{customdata[0]}",
            "<b>결과</b>: %{customdata[1]}<br>",
            "<b>지지도</b>: %{x:.3%}",
            "<b>신뢰도</b>: %{y:.3%}",
            "<b>향상도</b>: %{customdata[2]:.4f}",
        ]
    )
)
# Output figure size and hover behavior.
fig.update_layout(
    width=1100, # width
    height=600, # height
    hovermode='closest' # show the tooltip of the nearest data point
)
# Display the figure.
fig.show()
# Recompute rules with confidence threshold 0 so every derivable rule
# appears, then keep only rules with one antecedent and one consequent.
rules_conf0 = association_rules(freq_items, metric="confidence", min_threshold=0)
# One loop instead of two copy-pasted apply() calls; ",".join accepts the
# frozenset directly, so the intermediate list() was redundant.
for _col in ("antecedents", "consequents"):
    rules_conf0[f"{_col}_str"] = rules_conf0[_col].apply(",".join)
# Keep only rules whose antecedent and consequent each contain one trait.
single_condition = (rules_conf0["antecedents"].apply(len) == 1) & (
    rules_conf0["consequents"].apply(len) == 1
)
rules_single = rules_conf0[single_condition].loc[
    :, ["antecedents_str", "consequents_str", "confidence"]
]
# Traits used on both heatmap axes (unique single-trait antecedents).
target_trait = sorted(rules_single["antecedents_str"].unique())
# Build an O(1) (antecedent, consequent) -> confidence lookup once instead
# of filtering the whole rules frame for every heatmap cell.
conf_lookup = {
    (rule.antecedents_str, rule.consequents_str): rule.confidence
    for rule in rules_single.itertuples(index=False)
}
# Cell [row, col] holds the confidence of the rule row -> col; pairs with
# no rule default to 0, matching the previous behavior.
revised_rules_df = pd.DataFrame(
    {
        col: [conf_lookup.get((row, col), 0) for row in target_trait]
        for col in target_trait
    },
    index=target_trait,
)
# Annotated heatmap of row -> column rule confidence between traits.
fig = plt.figure(figsize=(15, 10))
fig.suptitle("각 특성의 신뢰도", x=0.45)
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(
    revised_rules_df,
    annot=True,
    ax=ax,
    cmap=sns.color_palette("Blues", as_cmap=True),
    linewidths=0.01,
)
plt.tight_layout()
plt.show()